Principal Component Analysis

Imports


In [8]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

Data


In [9]:
# Load the iris dataset; X takes its `data` attribute and y its `target`.
iris = load_iris()
X, y = iris.data, iris.target

PCA


In [19]:
def pca_2_components(X):
    """Project ``X`` onto its two leading principal components.

    The features are passed through ``StandardScaler``, the covariance
    matrix of the scaled features is eigendecomposed, and the scaled data
    is projected onto the two eigenvectors whose eigenvalues have the
    largest absolute value.

    Args:
        X: 2-D array-like of shape (n_samples, n_features), as accepted by
            ``StandardScaler.fit_transform``. Any number of features >= 2
            is supported.

    Returns:
        Array of shape (n_samples, 2). Column 0 is the projection onto the
        eigenvector with the largest eigenvalue, column 1 onto the second
        largest.
    """
    X_std = StandardScaler().fit_transform(X)
    # np.cov treats each row as a variable, hence the transpose.
    cov_mat = np.cov(X_std.T)
    # NOTE(review): cov_mat is symmetric, so np.linalg.eigh would also apply;
    # eig is kept so existing outputs (including component signs) do not change.
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    # Rank on eigenvalue magnitude alone. Sorting (value, vector) tuples
    # falls back to comparing the arrays when two eigenvalues are equal,
    # which raises ValueError.
    top_two = np.argsort(np.abs(eig_vals))[::-1][:2]
    # Eigenvectors are the columns of eig_vecs; selecting columns keeps the
    # projection matrix at (n_features, 2) for any feature count.
    matrix_w = eig_vecs[:, top_two]
    return X_std.dot(matrix_w)

In [21]:
# Reduce the iris feature matrix X to its two leading principal components.
X_new = pca_2_components(X)